Use raw fastq and generate the quality plots to asses the quality of reads
Filter and trim out bad sequences and bases from our sequencing files
Write out fastq files with high quality sequences
Evaluate the quality from our filter and trim.
Infer errors on forward and reverse reads individually
Identified ASVs on forward and reverse reads separately using the error model.
Merge forward and reverse ASVs into “contigous ASVs”.
Generate ASV count table. (otu_table input for
phyloseq.).
ASV count table: otu_table
Taxonomy table tax_table
Sample information: sample_table track the reads
lost throughout DADA2 workflow.
#Set the raw fastq path to the raw sequencing files
#Path to the fastq files
raw_fastqs_path <- "data/01_DADA2/00_trimmed_fastq"
raw_fastqs_path## [1] "data/01_DADA2/00_trimmed_fastq"
## [1] "SRR17060816_trim_1.fq.gz" "SRR17060816_trim_2.fq.gz"
## [3] "SRR17060817_trim_1.fq.gz" "SRR17060817_trim_2.fq.gz"
## [5] "SRR17060818_trim_1.fq.gz" "SRR17060818_trim_2.fq.gz"
## [7] "SRR17060819_trim_1.fq.gz" "SRR17060819_trim_2.fq.gz"
## [9] "SRR17060820_trim_1.fq.gz" "SRR17060820_trim_2.fq.gz"
## [11] "SRR17060821_trim_1.fq.gz" "SRR17060821_trim_2.fq.gz"
## [13] "SRR17060822_trim_1.fq.gz" "SRR17060822_trim_2.fq.gz"
## [15] "SRR17060823_trim_1.fq.gz" "SRR17060823_trim_2.fq.gz"
## [17] "SRR17060824_trim_1.fq.gz" "SRR17060824_trim_2.fq.gz"
## [19] "SRR17060825_trim_1.fq.gz" "SRR17060825_trim_2.fq.gz"
## [21] "SRR17060826_trim_1.fq.gz" "SRR17060826_trim_2.fq.gz"
## [23] "SRR17060827_trim_1.fq.gz" "SRR17060827_trim_2.fq.gz"
## [25] "SRR17060828_trim_1.fq.gz" "SRR17060828_trim_2.fq.gz"
## [27] "SRR17060829_trim_1.fq.gz" "SRR17060829_trim_2.fq.gz"
## [29] "SRR17060830_trim_1.fq.gz" "SRR17060830_trim_2.fq.gz"
## [31] "SRR17060831_trim_1.fq.gz" "SRR17060831_trim_2.fq.gz"
## [33] "SRR17060832_trim_1.fq.gz" "SRR17060832_trim_2.fq.gz"
## [35] "SRR17060833_trim_1.fq.gz" "SRR17060833_trim_2.fq.gz"
## [37] "SRR17060834_trim_1.fq.gz" "SRR17060834_trim_2.fq.gz"
## [39] "SRR17060835_trim_1.fq.gz" "SRR17060835_trim_2.fq.gz"
## [41] "SRR17060836_trim_1.fq.gz" "SRR17060836_trim_2.fq.gz"
## [43] "SRR17060837_trim_1.fq.gz" "SRR17060837_trim_2.fq.gz"
## [45] "SRR17060838_trim_1.fq.gz" "SRR17060838_trim_2.fq.gz"
## [47] "SRR17060839_trim_1.fq.gz" "SRR17060839_trim_2.fq.gz"
## [49] "SRR17060840_trim_1.fq.gz" "SRR17060840_trim_2.fq.gz"
## [51] "SRR17060841_trim_1.fq.gz" "SRR17060841_trim_2.fq.gz"
## [53] "SRR17060842_trim_1.fq.gz" "SRR17060842_trim_2.fq.gz"
## [55] "SRR17060843_trim_1.fq.gz" "SRR17060843_trim_2.fq.gz"
## [57] "SRR17060844_trim_1.fq.gz" "SRR17060844_trim_2.fq.gz"
## [59] "SRR17060845_trim_1.fq.gz" "SRR17060845_trim_2.fq.gz"
## [61] "SRR17060846_trim_1.fq.gz" "SRR17060846_trim_2.fq.gz"
## [63] "SRR17060847_trim_1.fq.gz" "SRR17060847_trim_2.fq.gz"
## chr [1:64] "SRR17060816_trim_1.fq.gz" "SRR17060816_trim_2.fq.gz" ...
#Create a vector of forward reads
forward_reads <- list.files(raw_fastqs_path, pattern = "_trim_1.fq.gz", full.names = TRUE)
#Intuition check
head(forward_reads)## [1] "data/01_DADA2/00_trimmed_fastq/SRR17060816_trim_1.fq.gz"
## [2] "data/01_DADA2/00_trimmed_fastq/SRR17060817_trim_1.fq.gz"
## [3] "data/01_DADA2/00_trimmed_fastq/SRR17060818_trim_1.fq.gz"
## [4] "data/01_DADA2/00_trimmed_fastq/SRR17060819_trim_1.fq.gz"
## [5] "data/01_DADA2/00_trimmed_fastq/SRR17060820_trim_1.fq.gz"
## [6] "data/01_DADA2/00_trimmed_fastq/SRR17060821_trim_1.fq.gz"
#Create a vector of reverse reads
reverse_reads <-list.files(raw_fastqs_path, pattern = "_trim_2.fq.gz", full.names = TRUE)
#Intuition check
head(reverse_reads)## [1] "data/01_DADA2/00_trimmed_fastq/SRR17060816_trim_2.fq.gz"
## [2] "data/01_DADA2/00_trimmed_fastq/SRR17060817_trim_2.fq.gz"
## [3] "data/01_DADA2/00_trimmed_fastq/SRR17060818_trim_2.fq.gz"
## [4] "data/01_DADA2/00_trimmed_fastq/SRR17060819_trim_2.fq.gz"
## [5] "data/01_DADA2/00_trimmed_fastq/SRR17060820_trim_2.fq.gz"
## [6] "data/01_DADA2/00_trimmed_fastq/SRR17060821_trim_2.fq.gz"
# Randomly select 12 samples from dataset to evaluate
# Selecting 12 is typically better than 2 (like we did in class for efficiency)
random_samples <- sample(1:length(reverse_reads), size = 12)
random_samples## [1] 16 22 15 1 14 6 30 27 11 13 23 32
# Calculate and plot quality of these two samples
forward_filteredQual_plot_12 <- plotQualityProfile(forward_reads[random_samples]) +
labs(title = "Forward Read: Raw Quality")
reverse_filteredQual_plot_12 <- plotQualityProfile(reverse_reads[random_samples]) +
labs(title = "Reverse Read: Raw Quality")
# Plot them together with patchwork
forward_filteredQual_plot_12 + reverse_filteredQual_plot_12# Aggregate all QC plots
# Forward reads
forward_preQC_plot <-
plotQualityProfile(forward_reads, aggregate = TRUE) +
labs(title = "Forward Pre-QC")
# reverse reads
reverse_preQC_plot <-
plotQualityProfile(reverse_reads, aggregate = TRUE) +
labs(title = "Reverse Pre-QC")
preQC_aggregate_plot <-
# Plot the forward and reverse together
forward_preQC_plot + reverse_preQC_plot
# Show the plot
preQC_aggregate_plot# vector of our samples, extract the sample information from our file
samples <- sapply(strsplit(basename(forward_reads), "_"), `[`,1)
#Intuition check
head(samples)## [1] "SRR17060816" "SRR17060817" "SRR17060818" "SRR17060819" "SRR17060820"
## [6] "SRR17060821"
#place filtered reads into filtered_fastqs_path
filtered_fastqs_path <- "data/01_DADA2/02_filtered_fastqs"
filtered_fastqs_path## [1] "data/01_DADA2/02_filtered_fastqs"
# create 2 variables : filtered_F, filtered_R
filtered_forward_reads <-
file.path(filtered_fastqs_path, paste0(samples, "_R1_filtered.fastq.gz"))
#Intuition check
head(filtered_forward_reads)## [1] "data/01_DADA2/02_filtered_fastqs/SRR17060816_R1_filtered.fastq.gz"
## [2] "data/01_DADA2/02_filtered_fastqs/SRR17060817_R1_filtered.fastq.gz"
## [3] "data/01_DADA2/02_filtered_fastqs/SRR17060818_R1_filtered.fastq.gz"
## [4] "data/01_DADA2/02_filtered_fastqs/SRR17060819_R1_filtered.fastq.gz"
## [5] "data/01_DADA2/02_filtered_fastqs/SRR17060820_R1_filtered.fastq.gz"
## [6] "data/01_DADA2/02_filtered_fastqs/SRR17060821_R1_filtered.fastq.gz"
## [1] 32
filtered_reverse_reads <- file.path(filtered_fastqs_path, paste0(samples,
"_R2_filtered.fastq.gz"))
#Intuition check
length(filtered_reverse_reads)## [1] 32
Parameters of filter and trim DEPEND ON THE DATASET
maxN = number of N bases. Remove all Ns from the
data.maxEE = quality filtering threshold applied to expected
errors. By default, all expected errors. Mar recommends using c(1,1).
Here, if there is maxEE expected errors, its okay. If more, throw away
sequence.trimLeft = trim certain number of base pairs on start
of each readtruncQ = truncate reads at the first instance of a
quality score less than or equal to selected number. Chose 2rm.phix = remove phi xcompress = make filtered files .gzippedmultithread = multithread#Assign a vector to filtered reads
#Trim out poor bases
#Write out filtered fastq files
filtered_reads <-
filterAndTrim(fwd = forward_reads, filt = filtered_forward_reads,
rev = reverse_reads, filt.rev = filtered_reverse_reads,
maxN = 0, maxEE = c(2, 2),truncQ = 2, rm.phix = TRUE,
compress = TRUE, multithread = 6)# Plot the 12 random samples after QC
forward_filteredQual_plot_12 <-
plotQualityProfile(filtered_forward_reads[random_samples]) +
labs(title = "Trimmed Forward Read Quality")
reverse_filteredQual_plot_12 <-
plotQualityProfile(filtered_reverse_reads[random_samples]) +
labs(title = "Trimmed Reverse Read Quality")
# Put the two plots together
forward_filteredQual_plot_12 + reverse_filteredQual_plot_12# Aggregate all QC plots
# Forward reads
forward_postQC_plot <-
plotQualityProfile(filtered_forward_reads, aggregate = TRUE) +
labs(title = "Forward Post-QC")
# reverse reads
reverse_postQC_plot <-
plotQualityProfile(filtered_reverse_reads, aggregate = TRUE) +
labs(title = "Reverse Post-QC")
postQC_aggregate_plot <-
# Plot the forward and reverse together
forward_postQC_plot + reverse_postQC_plot
# Show the plot
postQC_aggregate_plotfilterAndTrim## reads.in reads.out
## SRR17060816_trim_1.fq.gz 285558 19369
## SRR17060817_trim_1.fq.gz 676817 10436
## SRR17060818_trim_1.fq.gz 591364 12909
## SRR17060819_trim_1.fq.gz 379452 19735
## SRR17060820_trim_1.fq.gz 570270 17813
## SRR17060821_trim_1.fq.gz 556682 19371
# calculate some stats
filtered_df %>%
reframe(median_reads_in = median(reads.in),
median_reads_out = median(reads.out),
median_percent_retained = (median(reads.out)/median(reads.in)))## median_reads_in median_reads_out median_percent_retained
## 1 294748.5 18591 0.06307411
[Insert paragraph interpreting the results above]
filterAndTrim()
more? If so, which parameters?Note every sequencing run needs to be run
separately! The error model MUST be run separately on
each illumina dataset. If you’d like to combine the datasets from
multiple sequencing runs, you’ll need to do the exact same
filterAndTrim() step AND, very importantly, you’ll
need to have the same primer and ASV length expected by the output.
Infer error rates for all possible transitions within purines and pyrimidines (A<>G or C<>T) and transversions between all purine and pyrimidine combinations.
Error model is learned by alternating estimation of the error rates and inference of sample composition until they converge.
## 94945475 total bases in 1027632 reads from 32 samples will be used for learning the error rates.
#Plot forward reads errors
forward_error_plot <-
plotErrors(error_forward_reads, nominalQ = TRUE) +
labs(title = "Forward Read Error Model")
#Reverse reads
error_reverse_reads <-
learnErrors(filtered_reverse_reads, multithread = TRUE)## 103680356 total bases in 400060 reads from 11 samples will be used for learning the error rates.
#Plot reverse reads errors
reverse_error_plot <-
plotErrors(error_reverse_reads, nominalQ = TRUE) +
labs(title = "Reverse Read Error Model")
#Put the two plots together
forward_error_plot + reverse_error_plot## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## log-10 transformation introduced infinite values.
## log-10 transformation introduced infinite values.
[Insert paragraph interpreting the plot above above]
Details of the plot: - Points: The observed error
rates for each consensus quality score.
- Black line: Estimated error rates after convergence
of the machine-learning algorithm.
- Red line: The error rates expected under the nominal
definition of the Q-score.
Similar to what is mentioned in the dada2 tutorial: the estimated error rates (black line) are a “reasonably good” fit to the observed rates (points), and the error rates drop with increased quality as expected. We can now infer ASVs!
An important note: This process occurs separately on forward and reverse reads! This is quite a different approach from how OTUs are identified in Mothur and also from UCHIME, oligotyping, and other OTU, MED, and ASV approaches.
#Infer forward ASVs
dada_forward <- dada(filtered_forward_reads,
err = error_forward_reads,
multithread = 6)## Sample 1 - 19369 reads in 4684 unique sequences.
## Sample 2 - 10436 reads in 887 unique sequences.
## Sample 3 - 12909 reads in 1306 unique sequences.
## Sample 4 - 19735 reads in 3005 unique sequences.
## Sample 5 - 17813 reads in 1854 unique sequences.
## Sample 6 - 19371 reads in 1484 unique sequences.
## Sample 7 - 72573 reads in 31057 unique sequences.
## Sample 8 - 104922 reads in 48332 unique sequences.
## Sample 9 - 12184 reads in 1582 unique sequences.
## Sample 10 - 59104 reads in 25836 unique sequences.
## Sample 11 - 51644 reads in 23771 unique sequences.
## Sample 12 - 41320 reads in 17526 unique sequences.
## Sample 13 - 63851 reads in 25596 unique sequences.
## Sample 14 - 1913 reads in 1405 unique sequences.
## Sample 15 - 1200 reads in 405 unique sequences.
## Sample 16 - 14485 reads in 1891 unique sequences.
## Sample 17 - 8353 reads in 1179 unique sequences.
## Sample 18 - 16945 reads in 2122 unique sequences.
## Sample 19 - 21399 reads in 3316 unique sequences.
## Sample 20 - 38840 reads in 4176 unique sequences.
## Sample 21 - 12576 reads in 933 unique sequences.
## Sample 22 - 10763 reads in 2248 unique sequences.
## Sample 23 - 17402 reads in 1298 unique sequences.
## Sample 24 - 2269 reads in 1328 unique sequences.
## Sample 25 - 89361 reads in 28320 unique sequences.
## Sample 26 - 69242 reads in 19628 unique sequences.
## Sample 27 - 65484 reads in 22451 unique sequences.
## Sample 28 - 63625 reads in 25978 unique sequences.
## Sample 29 - 61755 reads in 31554 unique sequences.
## Sample 30 - 3879 reads in 1862 unique sequences.
## Sample 31 - 16616 reads in 2413 unique sequences.
## Sample 32 - 6294 reads in 1049 unique sequences.
#Infer reverse ASVs
dada_reverse <- dada(filtered_reverse_reads,
err = error_reverse_reads,
multithread = 6)## Sample 1 - 19369 reads in 11473 unique sequences.
## Sample 2 - 10436 reads in 5249 unique sequences.
## Sample 3 - 12909 reads in 6265 unique sequences.
## Sample 4 - 19735 reads in 11898 unique sequences.
## Sample 5 - 17813 reads in 8748 unique sequences.
## Sample 6 - 19371 reads in 9023 unique sequences.
## Sample 7 - 72573 reads in 37738 unique sequences.
## Sample 8 - 104922 reads in 66028 unique sequences.
## Sample 9 - 12184 reads in 6809 unique sequences.
## Sample 10 - 59104 reads in 35504 unique sequences.
## Sample 11 - 51644 reads in 30183 unique sequences.
## Sample 12 - 41320 reads in 22054 unique sequences.
## Sample 13 - 63851 reads in 37043 unique sequences.
## Sample 14 - 1913 reads in 1560 unique sequences.
## Sample 15 - 1200 reads in 945 unique sequences.
## Sample 16 - 14485 reads in 8617 unique sequences.
## Sample 17 - 8353 reads in 5043 unique sequences.
## Sample 18 - 16945 reads in 9082 unique sequences.
## Sample 19 - 21399 reads in 12657 unique sequences.
## Sample 20 - 38840 reads in 19949 unique sequences.
## Sample 21 - 12576 reads in 5822 unique sequences.
## Sample 22 - 10763 reads in 7404 unique sequences.
## Sample 23 - 17402 reads in 8388 unique sequences.
## Sample 24 - 2269 reads in 1547 unique sequences.
## Sample 25 - 89361 reads in 47195 unique sequences.
## Sample 26 - 69242 reads in 34236 unique sequences.
## Sample 27 - 65484 reads in 28196 unique sequences.
## Sample 28 - 63625 reads in 31398 unique sequences.
## Sample 29 - 61755 reads in 41082 unique sequences.
## Sample 30 - 3879 reads in 2811 unique sequences.
## Sample 31 - 16616 reads in 9968 unique sequences.
## Sample 32 - 6294 reads in 4068 unique sequences.
## $SRR17060816_R1_filtered.fastq.gz
## dada-class: object describing DADA2 denoising results
## 159 sequence variants were inferred from 4684 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## $SRR17060816_R2_filtered.fastq.gz
## dada-class: object describing DADA2 denoising results
## 119 sequence variants were inferred from 11473 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## $SRR17060827_R1_filtered.fastq.gz
## dada-class: object describing DADA2 denoising results
## 34 sequence variants were inferred from 17526 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## $SRR17060827_R2_filtered.fastq.gz
## dada-class: object describing DADA2 denoising results
## 58 sequence variants were inferred from 22054 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
Now, merge the forward and reverse ASVs into contigs.
# merge forward and reverse ASVs
merged_ASVs <- mergePairs(dada_forward, filtered_forward_reads,
dada_reverse, filtered_reverse_reads,
verbose = TRUE)## 16941 paired-reads (in 98 unique pairings) successfully merged out of 18810 (in 233 pairings) input.
## 9266 paired-reads (in 37 unique pairings) successfully merged out of 10109 (in 94 pairings) input.
## 11834 paired-reads (in 36 unique pairings) successfully merged out of 12660 (in 78 pairings) input.
## 17079 paired-reads (in 128 unique pairings) successfully merged out of 19133 (in 277 pairings) input.
## 15621 paired-reads (in 48 unique pairings) successfully merged out of 17558 (in 124 pairings) input.
## 17366 paired-reads (in 54 unique pairings) successfully merged out of 19009 (in 104 pairings) input.
## 65499 paired-reads (in 93 unique pairings) successfully merged out of 72330 (in 372 pairings) input.
## 93646 paired-reads (in 274 unique pairings) successfully merged out of 103650 (in 1327 pairings) input.
## 10064 paired-reads (in 48 unique pairings) successfully merged out of 11984 (in 120 pairings) input.
## 51253 paired-reads (in 86 unique pairings) successfully merged out of 58370 (in 522 pairings) input.
## 46771 paired-reads (in 140 unique pairings) successfully merged out of 50638 (in 621 pairings) input.
## 39570 paired-reads (in 40 unique pairings) successfully merged out of 40948 (in 193 pairings) input.
## 54885 paired-reads (in 64 unique pairings) successfully merged out of 63441 (in 316 pairings) input.
## 923 paired-reads (in 22 unique pairings) successfully merged out of 1687 (in 78 pairings) input.
## 796 paired-reads (in 26 unique pairings) successfully merged out of 881 (in 40 pairings) input.
## 12166 paired-reads (in 88 unique pairings) successfully merged out of 14071 (in 175 pairings) input.
## 6896 paired-reads (in 59 unique pairings) successfully merged out of 7941 (in 124 pairings) input.
## 14846 paired-reads (in 52 unique pairings) successfully merged out of 16743 (in 125 pairings) input.
## 17644 paired-reads (in 120 unique pairings) successfully merged out of 19708 (in 284 pairings) input.
## 35051 paired-reads (in 159 unique pairings) successfully merged out of 38131 (in 353 pairings) input.
## 11778 paired-reads (in 28 unique pairings) successfully merged out of 12405 (in 64 pairings) input.
## 7633 paired-reads (in 91 unique pairings) successfully merged out of 9111 (in 196 pairings) input.
## 15736 paired-reads (in 38 unique pairings) successfully merged out of 17248 (in 103 pairings) input.
## 1968 paired-reads (in 15 unique pairings) successfully merged out of 2093 (in 33 pairings) input.
## 84671 paired-reads (in 92 unique pairings) successfully merged out of 88446 (in 509 pairings) input.
## 67310 paired-reads (in 69 unique pairings) successfully merged out of 68852 (in 200 pairings) input.
## 52179 paired-reads (in 65 unique pairings) successfully merged out of 65186 (in 249 pairings) input.
## 51276 paired-reads (in 80 unique pairings) successfully merged out of 63047 (in 371 pairings) input.
## 53493 paired-reads (in 209 unique pairings) successfully merged out of 59865 (in 1246 pairings) input.
## 3446 paired-reads (in 23 unique pairings) successfully merged out of 3775 (in 44 pairings) input.
## 12776 paired-reads (in 66 unique pairings) successfully merged out of 16327 (in 170 pairings) input.
## 4702 paired-reads (in 34 unique pairings) successfully merged out of 6044 (in 92 pairings) input.
## [1] "list"
## [1] 32
## [1] "SRR17060816_R1_filtered.fastq.gz" "SRR17060817_R1_filtered.fastq.gz"
## [3] "SRR17060818_R1_filtered.fastq.gz" "SRR17060819_R1_filtered.fastq.gz"
## [5] "SRR17060820_R1_filtered.fastq.gz" "SRR17060821_R1_filtered.fastq.gz"
## [7] "SRR17060822_R1_filtered.fastq.gz" "SRR17060823_R1_filtered.fastq.gz"
## [9] "SRR17060824_R1_filtered.fastq.gz" "SRR17060825_R1_filtered.fastq.gz"
## [11] "SRR17060826_R1_filtered.fastq.gz" "SRR17060827_R1_filtered.fastq.gz"
## [13] "SRR17060828_R1_filtered.fastq.gz" "SRR17060829_R1_filtered.fastq.gz"
## [15] "SRR17060830_R1_filtered.fastq.gz" "SRR17060831_R1_filtered.fastq.gz"
## [17] "SRR17060832_R1_filtered.fastq.gz" "SRR17060833_R1_filtered.fastq.gz"
## [19] "SRR17060834_R1_filtered.fastq.gz" "SRR17060835_R1_filtered.fastq.gz"
## [21] "SRR17060836_R1_filtered.fastq.gz" "SRR17060837_R1_filtered.fastq.gz"
## [23] "SRR17060838_R1_filtered.fastq.gz" "SRR17060839_R1_filtered.fastq.gz"
## [25] "SRR17060840_R1_filtered.fastq.gz" "SRR17060841_R1_filtered.fastq.gz"
## [27] "SRR17060842_R1_filtered.fastq.gz" "SRR17060843_R1_filtered.fastq.gz"
## [29] "SRR17060844_R1_filtered.fastq.gz" "SRR17060845_R1_filtered.fastq.gz"
## [31] "SRR17060846_R1_filtered.fastq.gz" "SRR17060847_R1_filtered.fastq.gz"
# Create the ASV Count Table
raw_ASV_table <- makeSequenceTable(merged_ASVs)
# Write out the file to data/01_DADA2
# Check the type and dimensions of the data
dim(raw_ASV_table)## [1] 32 1370
## [1] "matrix" "array"
## [1] "integer"
# Inspect the distribution of sequence lengths of all ASVs in dataset
table(nchar(getSequences(raw_ASV_table)))##
## 114 152 171 186 199 215 226 228 245 252 253 254 281 282
## 1 2 1 6 1 1 1 1 1 97 565 18 619 56
# Inspect the distribution of sequence lengths of all ASVs in dataset
# AFTER TRIM
data.frame(Seq_Length = nchar(getSequences(raw_ASV_table))) %>%
ggplot(aes(x = Seq_Length )) +
geom_histogram() +
labs(title = "Raw distribution of ASV length")## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
###################################################
###################################################
# TRIM THE ASVS
# Let's trim the ASVs to only be the right size, which is 249.
# 249 originates from our expected amplicon of 252 - 3bp in the forward read due to low quality.
# We will allow for a few
raw_ASV_table_trimmed <- raw_ASV_table[,nchar(colnames(raw_ASV_table)) %in% 253]
# Inspect the distribution of sequence lengths of all ASVs in dataset
table(nchar(getSequences(raw_ASV_table_trimmed)))##
## 253
## 565
## [1] 0.6304546
# Inspect the distribution of sequence lengths of all ASVs in dataset
# AFTER TRIM
data.frame(Seq_Length = nchar(getSequences(raw_ASV_table_trimmed))) %>%
ggplot(aes(x = Seq_Length )) +
geom_histogram() +
labs(title = "Trimmed distribution of ASV length")## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Note the peak at 249 is ABOVE 3000
# Let's zoom in on the plot
data.frame(Seq_Length = nchar(getSequences(raw_ASV_table_trimmed))) %>%
ggplot(aes(x = Seq_Length )) +
geom_histogram() +
labs(title = "Trimmed distribution of ASV length") +
scale_y_continuous(limits = c(0, 500))## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_bar()`).
Taking into account the lower, zoomed-in plot. Do we want to remove those extra ASVs?
Sometimes chimeras arise in our workflow.
Chimeric sequences are artificial sequences formed by the combination of two or more distinct biological sequences. These chimeric sequences can arise during the polymerase chain reaction (PCR) amplification step of the 16S rRNA gene, where fragments from different templates can be erroneously joined together.
Chimera removal is an essential step in the analysis of 16S sequencing data to improve the accuracy of downstream analyses, such as taxonomic assignment and diversity assessment. It helps to avoid the inclusion of misleading or spurious sequences that could lead to incorrect biological interpretations.
# Remove the chimeras in the raw ASV table
noChimeras_ASV_table <- removeBimeraDenovo(raw_ASV_table_trimmed,
method="consensus",
multithread=TRUE, verbose=TRUE)## Identified 62 bimeras out of 565 input sequences.
## [1] 32 503
## [1] 0.9879569
## [1] 0.6228619
# Plot it
data.frame(Seq_Length_NoChim = nchar(getSequences(noChimeras_ASV_table))) %>%
ggplot(aes(x = Seq_Length_NoChim )) +
geom_histogram()+
labs(title = "Trimmed + Chimera Removal distribution of ASV length")## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Here, we will look at the number of reads that were lost in the filtering, denoising, merging, and chimera removal.
# A little function to identify number seqs
getN <- function(x) sum(getUniques(x))
# Make the table to track the seqs
track <- cbind(filtered_reads,
sapply(dada_forward, getN),
sapply(dada_reverse, getN),
sapply(merged_ASVs, getN),
rowSums(noChimeras_ASV_table))
head(track)## reads.in reads.out
## SRR17060816_trim_1.fq.gz 285558 19369 19220 18883 16941 0
## SRR17060817_trim_1.fq.gz 676817 10436 10358 10126 9266 0
## SRR17060818_trim_1.fq.gz 591364 12909 12843 12683 11834 0
## SRR17060819_trim_1.fq.gz 379452 19735 19557 19234 17079 0
## SRR17060820_trim_1.fq.gz 570270 17813 17698 17645 15621 0
## SRR17060821_trim_1.fq.gz 556682 19371 19190 19138 17366 0
# Update column names to be more informative (most are missing at the moment!)
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nochim")
rownames(track) <- samples
# Generate a dataframe to track the reads through our DADA2 pipeline
track_counts_df <-
track %>%
# make it a dataframe
as.data.frame() %>%
rownames_to_column(var = "names") %>%
mutate(perc_reads_retained = 100 * nochim / input)
# Visualize it in table format
DT::datatable(track_counts_df)# Plot it!
track_counts_df %>%
pivot_longer(input:nochim, names_to = "read_type", values_to = "num_reads") %>%
mutate(read_type = fct_relevel(read_type,
"input", "filtered", "denoisedF", "denoisedR", "merged", "nochim")) %>%
ggplot(aes(x = read_type, y = num_reads, fill = read_type)) +
geom_line(aes(group = names), color = "grey") +
geom_point(shape = 21, size = 3, alpha = 0.8) +
scale_fill_brewer(palette = "Spectral") +
labs(x = "Filtering Step", y = "Number of Sequences") +
theme_bw()Here, we will use the silva database version 138!
# The next line took 2 mins to run
taxa_train <-
assignTaxonomy(noChimeras_ASV_table,
"/workdir/in_class_data/taxonomy/silva_nr99_v138.1_train_set.fa.gz",
multithread=TRUE)
# the next line took 3 minutes
taxa_addSpecies <-
addSpecies(taxa_train,
"/workdir/in_class_data/taxonomy/silva_species_assignment_v138.1.fa.gz")
# Inspect the taxonomy
taxa_print <- taxa_addSpecies # Removing sequence rownames for display only
rownames(taxa_print) <- NULL
#View(taxa_print)Below, we will prepare the following:
ASV_fastas: A fasta file that we can use to build a
tree for phylogenetic analyses (e.g. phylogenetic alpha diversity
metrics or UNIFRAC dissimilarty).########### 2. COUNT TABLE ###############
############## Modify the ASV names and then save a fasta file! ##############
# Give headers more manageable names
# First pull the ASV sequences
asv_seqs <- colnames(noChimeras_ASV_table)
asv_seqs[1:5]## [1] "TACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTTAAGCAAGATGTGAAAGCCCCGGGCTTAACCTGGGAATCGCATTTTGAACTGGCAAGCTAGAGTCTTGTAGAGGGGGGTAGAATTTCAGGTGTAGCGGTGAAATGCGTAGAGATCTGAAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACAAAGACTGACGCTCAGATGCGAAAGCGTGGGGAGCAAACGGG"
## [2] "TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGTCTCTTAAGTCAGCGTTGAAAGTTTTCGGCTCAACCGGAAAATTGGCATTGAAACTGGGAGACTTGAGTGTAAATGAAGTTGGCGGAATTCGTTGTGTAGCGGTGAAATGCATAGATATAACGAAGAACTCCGATTGCGAAGGCAGCTGACTAACATACAACTGACGCTGAGGCACGAAAGCGTGGGGATCAAACAGG"
## [3] "TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCCTGTTAAGTTGGATGTGAAAGCCCCGGGCTCAACCTGGGAACGGCATCCAAAACTGAGAGGCTCGAGTGCGGAAGAGGAGTGTGGAATTTCCTGTGTAGCGGTGAAATGCGTAGATATAGGAAGGAACACCAGTGGCGAAGGCGACACTCTGGTCTGACACTGACGCTGAGGTACGAAAGCGTGGGGAGCAAACAGG"
## [4] "TACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTTAAGCAAGATGTGAAAGCCCCGGGCTTAACCTGGGAATTGCATTTTGAACTGGCAAGCTAGAGTCTTGTAGAGGGGGGTAGAATTTCAGGTGTAGCGGTGAAATGCGTAGAGATCTGAAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACAAAGACTGACGCTCAGATGCGAAAGCGTGGGGAGCAAACGGG"
## [5] "TACGTAAGGGACGAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGTTAATTAGGCTGAGTGTTAAAGACTGGGGCTCAACTCCAGAAGGGCATTCAGAACCGGTTGACTAGAATCTGGTGGAAGGCAACGGAATTTCCCGTGTAGCGGTGGAATGCATAGATATGGGAAGGAACACCAAAGGCGAAGGCAGTTGTCTATGCCAAGATTGACGCTGAGGCGCGAAAGTGTGGGGATCAAATAGG"
# make headers for our ASV seq fasta file, which will be our asv names
asv_headers <- vector(dim(noChimeras_ASV_table)[2], mode = "character")
asv_headers[1:5]## [1] "" "" "" "" ""
# loop through vector and fill it in with ASV names
for (i in 1:dim(noChimeras_ASV_table)[2]) {
asv_headers[i] <- paste(">ASV", i, sep = "_")
}
# intitution check
asv_headers[1:5]## [1] ">ASV_1" ">ASV_2" ">ASV_3" ">ASV_4" ">ASV_5"
# Inspect the taxonomy table
#View(taxa_addSpecies)
##### Prepare tax table
# Add the ASV sequences from the rownames to a column
new_tax_tab <-
taxa_addSpecies%>%
as.data.frame() %>%
rownames_to_column(var = "ASVseqs")
head(new_tax_tab)## ASVseqs
## 1 TACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTTAAGCAAGATGTGAAAGCCCCGGGCTTAACCTGGGAATCGCATTTTGAACTGGCAAGCTAGAGTCTTGTAGAGGGGGGTAGAATTTCAGGTGTAGCGGTGAAATGCGTAGAGATCTGAAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACAAAGACTGACGCTCAGATGCGAAAGCGTGGGGAGCAAACGGG
## 2 TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGTCTCTTAAGTCAGCGTTGAAAGTTTTCGGCTCAACCGGAAAATTGGCATTGAAACTGGGAGACTTGAGTGTAAATGAAGTTGGCGGAATTCGTTGTGTAGCGGTGAAATGCATAGATATAACGAAGAACTCCGATTGCGAAGGCAGCTGACTAACATACAACTGACGCTGAGGCACGAAAGCGTGGGGATCAAACAGG
## 3 TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCCTGTTAAGTTGGATGTGAAAGCCCCGGGCTCAACCTGGGAACGGCATCCAAAACTGAGAGGCTCGAGTGCGGAAGAGGAGTGTGGAATTTCCTGTGTAGCGGTGAAATGCGTAGATATAGGAAGGAACACCAGTGGCGAAGGCGACACTCTGGTCTGACACTGACGCTGAGGTACGAAAGCGTGGGGAGCAAACAGG
## 4 TACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTTAAGCAAGATGTGAAAGCCCCGGGCTTAACCTGGGAATTGCATTTTGAACTGGCAAGCTAGAGTCTTGTAGAGGGGGGTAGAATTTCAGGTGTAGCGGTGAAATGCGTAGAGATCTGAAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACAAAGACTGACGCTCAGATGCGAAAGCGTGGGGAGCAAACGGG
## 5 TACGTAAGGGACGAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGTTAATTAGGCTGAGTGTTAAAGACTGGGGCTCAACTCCAGAAGGGCATTCAGAACCGGTTGACTAGAATCTGGTGGAAGGCAACGGAATTTCCCGTGTAGCGGTGGAATGCATAGATATGGGAAGGAACACCAAAGGCGAAGGCAGTTGTCTATGCCAAGATTGACGCTGAGGCGCGAAAGTGTGGGGATCAAATAGG
## 6 TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGTCTCTTAAGTCAGCGTTGAAAGTTTTCGGCTCAACCGGAAAATTGGCATTGAAACTGGGAGACTTGAGTGTAAATGAAGTTGGCGGAATTCGTTGTGTAGCGGTGAAATGCATAGATATAACGAAGAACTCCGATTGCGAAGGCAGCTGACTAACATACAACTGACGCTGATGCACGAAAGCGTGGGGATCAAACAGG
## Kingdom Phylum Class Order
## 1 Bacteria Proteobacteria Gammaproteobacteria Enterobacterales
## 2 Bacteria Bacteroidota Bacteroidia Bacteroidales
## 3 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales
## 4 Bacteria Proteobacteria Gammaproteobacteria Enterobacterales
## 5 Bacteria Spirochaetota Brevinematia Brevinematales
## 6 Bacteria Bacteroidota Bacteroidia Bacteroidales
## Family Genus Species
## 1 Vibrionaceae Enterovibrio <NA>
## 2 Tannerellaceae Macellibacteroides <NA>
## 3 Endozoicomonadaceae Endozoicomonas <NA>
## 4 Vibrionaceae Enterovibrio <NA>
## 5 Brevinemataceae Brevinema <NA>
## 6 Tannerellaceae Macellibacteroides <NA>
# intution check
stopifnot(new_tax_tab$ASVseqs == colnames(noChimeras_ASV_table))
# Now let's add the ASV names
rownames(new_tax_tab) <- rownames(asv_tab)
head(new_tax_tab)## ASVseqs
## ASV_1 TACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTTAAGCAAGATGTGAAAGCCCCGGGCTTAACCTGGGAATCGCATTTTGAACTGGCAAGCTAGAGTCTTGTAGAGGGGGGTAGAATTTCAGGTGTAGCGGTGAAATGCGTAGAGATCTGAAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACAAAGACTGACGCTCAGATGCGAAAGCGTGGGGAGCAAACGGG
## ASV_2 TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGTCTCTTAAGTCAGCGTTGAAAGTTTTCGGCTCAACCGGAAAATTGGCATTGAAACTGGGAGACTTGAGTGTAAATGAAGTTGGCGGAATTCGTTGTGTAGCGGTGAAATGCATAGATATAACGAAGAACTCCGATTGCGAAGGCAGCTGACTAACATACAACTGACGCTGAGGCACGAAAGCGTGGGGATCAAACAGG
## ASV_3 TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCCTGTTAAGTTGGATGTGAAAGCCCCGGGCTCAACCTGGGAACGGCATCCAAAACTGAGAGGCTCGAGTGCGGAAGAGGAGTGTGGAATTTCCTGTGTAGCGGTGAAATGCGTAGATATAGGAAGGAACACCAGTGGCGAAGGCGACACTCTGGTCTGACACTGACGCTGAGGTACGAAAGCGTGGGGAGCAAACAGG
## ASV_4 TACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTTAAGCAAGATGTGAAAGCCCCGGGCTTAACCTGGGAATTGCATTTTGAACTGGCAAGCTAGAGTCTTGTAGAGGGGGGTAGAATTTCAGGTGTAGCGGTGAAATGCGTAGAGATCTGAAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACAAAGACTGACGCTCAGATGCGAAAGCGTGGGGAGCAAACGGG
## ASV_5 TACGTAAGGGACGAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGTTAATTAGGCTGAGTGTTAAAGACTGGGGCTCAACTCCAGAAGGGCATTCAGAACCGGTTGACTAGAATCTGGTGGAAGGCAACGGAATTTCCCGTGTAGCGGTGGAATGCATAGATATGGGAAGGAACACCAAAGGCGAAGGCAGTTGTCTATGCCAAGATTGACGCTGAGGCGCGAAAGTGTGGGGATCAAATAGG
## ASV_6 TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGTCTCTTAAGTCAGCGTTGAAAGTTTTCGGCTCAACCGGAAAATTGGCATTGAAACTGGGAGACTTGAGTGTAAATGAAGTTGGCGGAATTCGTTGTGTAGCGGTGAAATGCATAGATATAACGAAGAACTCCGATTGCGAAGGCAGCTGACTAACATACAACTGACGCTGATGCACGAAAGCGTGGGGATCAAACAGG
## Kingdom Phylum Class Order
## ASV_1 Bacteria Proteobacteria Gammaproteobacteria Enterobacterales
## ASV_2 Bacteria Bacteroidota Bacteroidia Bacteroidales
## ASV_3 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales
## ASV_4 Bacteria Proteobacteria Gammaproteobacteria Enterobacterales
## ASV_5 Bacteria Spirochaetota Brevinematia Brevinematales
## ASV_6 Bacteria Bacteroidota Bacteroidia Bacteroidales
## Family Genus Species
## ASV_1 Vibrionaceae Enterovibrio <NA>
## ASV_2 Tannerellaceae Macellibacteroides <NA>
## ASV_3 Endozoicomonadaceae Endozoicomonas <NA>
## ASV_4 Vibrionaceae Enterovibrio <NA>
## ASV_5 Brevinemataceae Brevinema <NA>
## ASV_6 Tannerellaceae Macellibacteroides <NA>
### Final prep of tax table. Add new column with ASV names
asv_tax <-
new_tax_tab %>%
# add rownames from count table for phyloseq handoff
mutate(ASV = rownames(asv_tab)) %>%
# Resort the columns with select
dplyr::select(Kingdom, Phylum, Class, Order, Family, Genus, Species, ASV, ASVseqs)
head(asv_tax)## Kingdom Phylum Class Order
## ASV_1 Bacteria Proteobacteria Gammaproteobacteria Enterobacterales
## ASV_2 Bacteria Bacteroidota Bacteroidia Bacteroidales
## ASV_3 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales
## ASV_4 Bacteria Proteobacteria Gammaproteobacteria Enterobacterales
## ASV_5 Bacteria Spirochaetota Brevinematia Brevinematales
## ASV_6 Bacteria Bacteroidota Bacteroidia Bacteroidales
## Family Genus Species ASV
## ASV_1 Vibrionaceae Enterovibrio <NA> ASV_1
## ASV_2 Tannerellaceae Macellibacteroides <NA> ASV_2
## ASV_3 Endozoicomonadaceae Endozoicomonas <NA> ASV_3
## ASV_4 Vibrionaceae Enterovibrio <NA> ASV_4
## ASV_5 Brevinemataceae Brevinema <NA> ASV_5
## ASV_6 Tannerellaceae Macellibacteroides <NA> ASV_6
## ASVseqs
## ASV_1 TACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTTAAGCAAGATGTGAAAGCCCCGGGCTTAACCTGGGAATCGCATTTTGAACTGGCAAGCTAGAGTCTTGTAGAGGGGGGTAGAATTTCAGGTGTAGCGGTGAAATGCGTAGAGATCTGAAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACAAAGACTGACGCTCAGATGCGAAAGCGTGGGGAGCAAACGGG
## ASV_2 TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGTCTCTTAAGTCAGCGTTGAAAGTTTTCGGCTCAACCGGAAAATTGGCATTGAAACTGGGAGACTTGAGTGTAAATGAAGTTGGCGGAATTCGTTGTGTAGCGGTGAAATGCATAGATATAACGAAGAACTCCGATTGCGAAGGCAGCTGACTAACATACAACTGACGCTGAGGCACGAAAGCGTGGGGATCAAACAGG
## ASV_3 TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCCTGTTAAGTTGGATGTGAAAGCCCCGGGCTCAACCTGGGAACGGCATCCAAAACTGAGAGGCTCGAGTGCGGAAGAGGAGTGTGGAATTTCCTGTGTAGCGGTGAAATGCGTAGATATAGGAAGGAACACCAGTGGCGAAGGCGACACTCTGGTCTGACACTGACGCTGAGGTACGAAAGCGTGGGGAGCAAACAGG
## ASV_4 TACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTTAAGCAAGATGTGAAAGCCCCGGGCTTAACCTGGGAATTGCATTTTGAACTGGCAAGCTAGAGTCTTGTAGAGGGGGGTAGAATTTCAGGTGTAGCGGTGAAATGCGTAGAGATCTGAAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACAAAGACTGACGCTCAGATGCGAAAGCGTGGGGAGCAAACGGG
## ASV_5 TACGTAAGGGACGAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGTTAATTAGGCTGAGTGTTAAAGACTGGGGCTCAACTCCAGAAGGGCATTCAGAACCGGTTGACTAGAATCTGGTGGAAGGCAACGGAATTTCCCGTGTAGCGGTGGAATGCATAGATATGGGAAGGAACACCAAAGGCGAAGGCAGTTGTCTATGCCAAGATTGACGCTGAGGCGCGAAAGTGTGGGGATCAAATAGG
## ASV_6 TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGTCTCTTAAGTCAGCGTTGAAAGTTTTCGGCTCAACCGGAAAATTGGCATTGAAACTGGGAGACTTGAGTGTAAATGAAGTTGGCGGAATTCGTTGTGTAGCGGTGAAATGCATAGATATAACGAAGAACTCCGATTGCGAAGGCAGCTGACTAACATACAACTGACGCTGATGCACGAAAGCGTGGGGATCAAACAGG
01_DADA2 filesNow, we will write the files! We will write the following to the
data/01_DADA2/ folder. We will save both as files that
could be submitted as supplements AND as .RData objects for easy loading
into the next steps into R.:
ASV_counts.tsv: ASV count table that has ASV names that
are re-written and shortened headers like ASV_1, ASV_2, etc, which will
match the names in our fasta file below. This will also be saved as
data/01_DADA2/ASV_counts.RData.ASV_counts_withSeqNames.tsv: This is generated with the
data object in this file known as noChimeras_ASV_table. ASV
headers include the entire ASV sequence ~250bps. In addition,
we will save this as a .RData object as
data/01_DADA2/noChimeras_ASV_table.RData as we will use
this data in analysis/02_Taxonomic_Assignment.Rmd to assign
the taxonomy from the sequence headers.ASVs.fasta: A fasta file output of the ASV names from
ASV_counts.tsv and the sequences from the ASVs in
ASV_counts_withSeqNames.tsv. A fasta file that we can use
to build a tree for phylogenetic analyses (e.g. phylogenetic alpha
diversity metrics or UNIFRAC dissimilarty).ASVs.fasta in
data/02_TaxAss_FreshTrain/ to be used for the taxonomy
classification in the next step in the workflow.track_read_counts.RData: To track how many reads we
lost throughout our workflow that could be used and plotted later. We
will add this to the metadata in
analysis/02_Taxonomic_Assignment.Rmd.# FIRST, we will save our output as regular files, which will be useful later on.
# Save to regular .tsv file
# Write BOTH the modified and unmodified ASV tables to a file!
# Write count table with ASV numbered names (e.g. ASV_1, ASV_2, etc)
write.table(asv_tab, "data/01_DADA2/ASV_counts.tsv", sep = "\t", quote = FALSE, col.names = NA)
# Write count table with ASV sequence names
write.table(noChimeras_ASV_table, "data/01_DADA2/ASV_counts_withSeqNames.tsv", sep = "\t", quote = FALSE, col.names = NA)
# Write out the fasta file for reference later on for what seq matches what ASV
asv_fasta <- c(rbind(asv_headers, asv_seqs))
# Save to a file!
write(asv_fasta, "data/01_DADA2/ASVs.fasta")
# SECOND, let's save the taxonomy tables
# Write the table
write.table(asv_tax, "data/01_DADA2/ASV_taxonomy.tsv", sep = "\t", quote = FALSE, col.names = NA)
# THIRD, let's save to a RData object
# Each of these files will be used in the analysis/02_Taxonomic_Assignment
# RData objects are for easy loading :)
save(noChimeras_ASV_table, file = "data/01_DADA2/noChimeras_ASV_table.RData")
save(asv_tab, file = "data/01_DADA2/ASV_counts.RData")
# And save the track_counts_df a R object, which we will merge with metadata information in the next step of the analysis in nalysis/02_Taxonomic_Assignment.
save(track_counts_df, file = "data/01_DADA2/track_read_counts.RData")##Session information
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 4.3.2 (2023-10-31)
## os Rocky Linux 9.0 (Blue Onyx)
## system x86_64, linux-gnu
## ui X11
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz America/New_York
## date 2024-04-15
## pandoc 3.1.1 @ /usr/lib/rstudio-server/bin/quarto/bin/tools/ (via rmarkdown)
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date (UTC) lib source
## abind 1.4-5 2016-07-21 [2] CRAN (R 4.3.2)
## ade4 1.7-22 2023-02-06 [1] CRAN (R 4.3.2)
## ape 5.8 2024-04-11 [1] CRAN (R 4.3.2)
## Biobase 2.62.0 2023-10-24 [2] Bioconductor
## BiocGenerics 0.48.1 2023-11-01 [2] Bioconductor
## BiocParallel 1.36.0 2023-10-24 [2] Bioconductor
## biomformat 1.30.0 2023-10-24 [1] Bioconductor
## Biostrings 2.70.1 2023-10-25 [2] Bioconductor
## bitops 1.0-7 2021-04-24 [2] CRAN (R 4.3.2)
## bslib 0.5.1 2023-08-11 [2] CRAN (R 4.3.2)
## cachem 1.0.8 2023-05-01 [2] CRAN (R 4.3.2)
## callr 3.7.3 2022-11-02 [2] CRAN (R 4.3.2)
## cli 3.6.1 2023-03-23 [2] CRAN (R 4.3.2)
## cluster 2.1.4 2022-08-22 [2] CRAN (R 4.3.2)
## codetools 0.2-19 2023-02-01 [2] CRAN (R 4.3.2)
## colorspace 2.1-0 2023-01-23 [2] CRAN (R 4.3.2)
## crayon 1.5.2 2022-09-29 [2] CRAN (R 4.3.2)
## crosstalk 1.2.0 2021-11-04 [2] CRAN (R 4.3.2)
## dada2 * 1.30.0 2023-10-24 [1] Bioconductor
## data.table 1.14.8 2023-02-17 [2] CRAN (R 4.3.2)
## DelayedArray 0.28.0 2023-10-24 [2] Bioconductor
## deldir 1.0-9 2023-05-17 [2] CRAN (R 4.3.2)
## devtools * 2.4.4 2022-07-20 [2] CRAN (R 4.2.1)
## digest 0.6.33 2023-07-07 [2] CRAN (R 4.3.2)
## dplyr * 1.1.3 2023-09-03 [2] CRAN (R 4.3.2)
## DT * 0.32 2024-02-19 [1] CRAN (R 4.3.2)
## ellipsis 0.3.2 2021-04-29 [2] CRAN (R 4.3.2)
## evaluate 0.23 2023-11-01 [2] CRAN (R 4.3.2)
## fansi 1.0.5 2023-10-08 [2] CRAN (R 4.3.2)
## farver 2.1.1 2022-07-06 [2] CRAN (R 4.3.2)
## fastmap 1.1.1 2023-02-24 [2] CRAN (R 4.3.2)
## forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.3.2)
## foreach 1.5.2 2022-02-02 [2] CRAN (R 4.3.2)
## fs 1.6.3 2023-07-20 [2] CRAN (R 4.3.2)
## generics 0.1.3 2022-07-05 [2] CRAN (R 4.3.2)
## GenomeInfoDb 1.38.0 2023-10-24 [2] Bioconductor
## GenomeInfoDbData 1.2.11 2023-11-07 [2] Bioconductor
## GenomicAlignments 1.38.0 2023-10-24 [2] Bioconductor
## GenomicRanges 1.54.1 2023-10-29 [2] Bioconductor
## ggplot2 * 3.5.0 2024-02-23 [2] CRAN (R 4.3.2)
## glue 1.6.2 2022-02-24 [2] CRAN (R 4.3.2)
## gtable 0.3.4 2023-08-21 [2] CRAN (R 4.3.2)
## highr 0.10 2022-12-22 [2] CRAN (R 4.3.2)
## hms 1.1.3 2023-03-21 [1] CRAN (R 4.3.2)
## htmltools 0.5.7 2023-11-03 [2] CRAN (R 4.3.2)
## htmlwidgets 1.6.2 2023-03-17 [2] CRAN (R 4.3.2)
## httpuv 1.6.12 2023-10-23 [2] CRAN (R 4.3.2)
## hwriter 1.3.2.1 2022-04-08 [1] CRAN (R 4.3.2)
## igraph 1.5.1 2023-08-10 [2] CRAN (R 4.3.2)
## interp 1.1-6 2024-01-26 [1] CRAN (R 4.3.2)
## IRanges 2.36.0 2023-10-24 [2] Bioconductor
## iterators 1.0.14 2022-02-05 [2] CRAN (R 4.3.2)
## jpeg 0.1-10 2022-11-29 [1] CRAN (R 4.3.2)
## jquerylib 0.1.4 2021-04-26 [2] CRAN (R 4.3.2)
## jsonlite 1.8.7 2023-06-29 [2] CRAN (R 4.3.2)
## knitr 1.45 2023-10-30 [2] CRAN (R 4.3.2)
## labeling 0.4.3 2023-08-29 [2] CRAN (R 4.3.2)
## later 1.3.1 2023-05-02 [2] CRAN (R 4.3.2)
## lattice 0.21-9 2023-10-01 [2] CRAN (R 4.3.2)
## latticeExtra 0.6-30 2022-07-04 [1] CRAN (R 4.3.2)
## lifecycle 1.0.3 2022-10-07 [2] CRAN (R 4.3.2)
## lubridate * 1.9.3 2023-09-27 [1] CRAN (R 4.3.2)
## magrittr 2.0.3 2022-03-30 [2] CRAN (R 4.3.2)
## MASS 7.3-60 2023-05-04 [2] CRAN (R 4.3.2)
## Matrix 1.6-1.1 2023-09-18 [2] CRAN (R 4.3.2)
## MatrixGenerics 1.14.0 2023-10-24 [2] Bioconductor
## matrixStats 1.1.0 2023-11-07 [2] CRAN (R 4.3.2)
## memoise 2.0.1 2021-11-26 [2] CRAN (R 4.3.2)
## mgcv 1.9-0 2023-07-11 [2] CRAN (R 4.3.2)
## mime 0.12 2021-09-28 [2] CRAN (R 4.3.2)
## miniUI 0.1.1.1 2018-05-18 [2] CRAN (R 4.3.2)
## multtest 2.58.0 2023-10-24 [1] Bioconductor
## munsell 0.5.0 2018-06-12 [2] CRAN (R 4.3.2)
## nlme 3.1-163 2023-08-09 [2] CRAN (R 4.3.2)
## pacman 0.5.1 2019-03-11 [1] CRAN (R 4.3.2)
## patchwork * 1.2.0.9000 2024-03-12 [1] Github (thomasp85/patchwork@d943757)
## permute 0.9-7 2022-01-27 [1] CRAN (R 4.3.2)
## phyloseq * 1.41.1 2024-03-09 [1] Github (joey711/phyloseq@c260561)
## pillar 1.9.0 2023-03-22 [2] CRAN (R 4.3.2)
## pkgbuild 1.4.2 2023-06-26 [2] CRAN (R 4.3.2)
## pkgconfig 2.0.3 2019-09-22 [2] CRAN (R 4.3.2)
## pkgload 1.3.3 2023-09-22 [2] CRAN (R 4.3.2)
## plyr 1.8.9 2023-10-02 [2] CRAN (R 4.3.2)
## png 0.1-8 2022-11-29 [2] CRAN (R 4.3.2)
## prettyunits 1.2.0 2023-09-24 [2] CRAN (R 4.3.2)
## processx 3.8.2 2023-06-30 [2] CRAN (R 4.3.2)
## profvis 0.3.8 2023-05-02 [2] CRAN (R 4.3.2)
## promises 1.2.1 2023-08-10 [2] CRAN (R 4.3.2)
## ps 1.7.5 2023-04-18 [2] CRAN (R 4.3.2)
## purrr * 1.0.2 2023-08-10 [2] CRAN (R 4.3.2)
## R6 2.5.1 2021-08-19 [2] CRAN (R 4.3.2)
## RColorBrewer 1.1-3 2022-04-03 [2] CRAN (R 4.3.2)
## Rcpp * 1.0.11 2023-07-06 [2] CRAN (R 4.3.2)
## RcppParallel 5.1.7 2023-02-27 [2] CRAN (R 4.3.2)
## RCurl 1.98-1.13 2023-11-02 [2] CRAN (R 4.3.2)
## readr * 2.1.5 2024-01-10 [1] CRAN (R 4.3.2)
## remotes 2.4.2.1 2023-07-18 [2] CRAN (R 4.3.2)
## reshape2 1.4.4 2020-04-09 [2] CRAN (R 4.3.2)
## rhdf5 2.46.1 2023-11-29 [1] Bioconductor 3.18 (R 4.3.2)
## rhdf5filters 1.14.1 2023-11-06 [1] Bioconductor
## Rhdf5lib 1.24.2 2024-02-07 [1] Bioconductor 3.18 (R 4.3.2)
## rlang 1.1.2 2023-11-04 [2] CRAN (R 4.3.2)
## rmarkdown 2.25 2023-09-18 [2] CRAN (R 4.3.2)
## Rsamtools 2.18.0 2023-10-24 [2] Bioconductor
## rstudioapi 0.15.0 2023-07-07 [2] CRAN (R 4.3.2)
## S4Arrays 1.2.0 2023-10-24 [2] Bioconductor
## S4Vectors 0.40.1 2023-10-26 [2] Bioconductor
## sass 0.4.7 2023-07-15 [2] CRAN (R 4.3.2)
## scales 1.3.0 2023-11-28 [2] CRAN (R 4.3.2)
## sessioninfo 1.2.2 2021-12-06 [2] CRAN (R 4.3.2)
## shiny 1.7.5.1 2023-10-14 [2] CRAN (R 4.3.2)
## ShortRead 1.60.0 2023-10-24 [1] Bioconductor
## SparseArray 1.2.1 2023-11-05 [2] Bioconductor
## stringi 1.7.12 2023-01-11 [2] CRAN (R 4.3.2)
## stringr * 1.5.0 2022-12-02 [2] CRAN (R 4.3.2)
## SummarizedExperiment 1.32.0 2023-10-24 [2] Bioconductor
## survival 3.5-7 2023-08-14 [2] CRAN (R 4.3.2)
## tibble * 3.2.1 2023-03-20 [2] CRAN (R 4.3.2)
## tidyr * 1.3.0 2023-01-24 [2] CRAN (R 4.3.2)
## tidyselect 1.2.0 2022-10-10 [2] CRAN (R 4.3.2)
## tidyverse * 2.0.0 2023-02-22 [1] CRAN (R 4.3.2)
## timechange 0.3.0 2024-01-18 [1] CRAN (R 4.3.2)
## tzdb 0.4.0 2023-05-12 [1] CRAN (R 4.3.2)
## urlchecker 1.0.1 2021-11-30 [2] CRAN (R 4.3.2)
## usethis * 2.2.2 2023-07-06 [2] CRAN (R 4.3.2)
## utf8 1.2.4 2023-10-22 [2] CRAN (R 4.3.2)
## vctrs 0.6.4 2023-10-12 [2] CRAN (R 4.3.2)
## vegan 2.6-4 2022-10-11 [1] CRAN (R 4.3.2)
## withr 2.5.2 2023-10-30 [2] CRAN (R 4.3.2)
## xfun 0.41 2023-11-01 [2] CRAN (R 4.3.2)
## xtable 1.8-4 2019-04-21 [2] CRAN (R 4.3.2)
## XVector 0.42.0 2023-10-24 [2] Bioconductor
## yaml 2.3.7 2023-01-23 [2] CRAN (R 4.3.2)
## zlibbioc 1.48.0 2023-10-24 [2] Bioconductor
##
## [1] /home/cab565/R/x86_64-pc-linux-gnu-library/4.3
## [2] /programs/R-4.3.2/library
##
## ──────────────────────────────────────────────────────────────────────────────